Days between publication date and appearing on best seller list (Filtered looking just at Fiction) - using Alexis’ data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)

library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Load dataset (note: the file name literal contains a space before ".csv")
nyt_bestseller <- read.csv("NYT_best_seller .csv", stringsAsFactors = FALSE)

# Keep rows whose 'list_name' contains "Fiction" (case-insensitive)
fiction_df <- subset(nyt_bestseller, grepl("Fiction", list_name, ignore.case = TRUE))

# Parse 'bestsellers_date' and 'published_date' from m/d/yy text into Dates
fiction_df$bestsellers_date <- as.Date(fiction_df$bestsellers_date, format = "%m/%d/%y")
fiction_df$published_date <- as.Date(fiction_df$published_date, format = "%m/%d/%y")

# Keep books published from 2010 through 2016. Dates are compared directly
# rather than comparing a formatted year string against a number.
fiction_df <- subset(
  fiction_df,
  published_date >= as.Date("2010-01-01") & published_date <= as.Date("2016-12-31")
)

# Day difference: bestsellers_date minus published_date, so the value is
# negative whenever bestsellers_date is the earlier of the two dates.
fiction_df$days_to_best_seller <- as.numeric(
  difftime(fiction_df$bestsellers_date, fiction_df$published_date, units = "days")
)

# Scatter plot with a LOESS trend line
ggplot(fiction_df, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +  # Points colored by day difference
  # `linewidth` replaces the line `size` aesthetic deprecated in ggplot2 3.4.0;
  # the explicit formula avoids the "using formula" message.
  geom_smooth(method = "loess", formula = y ~ x, se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +                     # Gradient from blue (low) to red (high)
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +             # One x-axis label per year
  labs(title = "Days to Bestseller vs. Publication Date (2010-2016)",
       x = "Publication Date",
       y = "Days to Become Bestseller",
       color = "Days to Bestseller") +
  theme_minimal(base_size = 14) +                                        # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1))               # Rotate x-axis labels for readability
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

This data set has only three categories: Trade Fiction, Mass Market, and Hardcover Fiction. This first graph looks at Trade Fiction and Hardcover Fiction best sellers published from 2010 to 2016 and plots the number of days between each book's publication date and its bestseller-list date. The gap between the two dates is always 14 or 15 days (the computed values are negative, -14 or -15, because bestsellers_date falls before published_date).

Includes all categories in Alexis’ data

# Load necessary libraries
library(ggplot2)
library(scales)  # Date formatting helpers



# Parse 'bestsellers_date' and 'published_date' from m/d/yy text into Dates
nyt_bestseller$bestsellers_date <- as.Date(nyt_bestseller$bestsellers_date, format = "%m/%d/%y")
nyt_bestseller$published_date <- as.Date(nyt_bestseller$published_date, format = "%m/%d/%y")

# Keep books published from 2010 through 2016. This overwrites nyt_bestseller,
# so later chunks see only the filtered rows. Dates are compared directly
# rather than comparing a formatted year string against a number.
nyt_bestseller <- subset(
  nyt_bestseller,
  published_date >= as.Date("2010-01-01") & published_date <= as.Date("2016-12-31")
)

# Day difference: bestsellers_date minus published_date (negative when
# bestsellers_date is the earlier date)
nyt_bestseller$days_to_best_seller <- as.numeric(
  difftime(nyt_bestseller$bestsellers_date, nyt_bestseller$published_date, units = "days")
)

# Scatter plot with a LOESS trend line
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +  # Points colored by day difference
  # `linewidth` replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  geom_smooth(method = "loess", formula = y ~ x, se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +                     # Gradient from blue (low) to red (high)
  scale_x_date(date_labels = "%Y", date_breaks = "1 year") +             # One x-axis label per year
  labs(title = "Days to Bestseller vs. Publication Date (2010-2016)",
       x = "Publication Date",
       y = "Days to Become Bestseller",
       color = "Days to Bestseller") +
  theme_minimal(base_size = 14) +                                        # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1))               # Rotate x-axis labels for readability
## `geom_smooth()` using formula = 'y ~ x'

This graph looks at Hardcover Fiction, Trade Fiction, and Mass Market. There is a similar range of days between publication date and reaching the bestseller list.

library(ggplot2)
library(scales)

# Same scatter plot as before, but with an x-axis tick every three months
# labelled with abbreviated month and year.
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) +
  # `linewidth` replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
  geom_smooth(method = "loess", formula = y ~ x, se = TRUE, color = "red", linewidth = 1) +
  scale_color_gradient(low = "blue", high = "red") +
  scale_x_date(date_labels = "%b %Y", date_breaks = "3 months") +
  labs(title = "Days to Bestseller vs. Publication Date (2010-2016)",
       x = "Publication Date",
       y = "Days to Become Bestseller",
       color = "Days to Bestseller") +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## `geom_smooth()` using formula = 'y ~ x'

Added months to the x-axis labels for better readability.

library(dplyr)
# Dump every computed day difference to the console for inspection
print(nyt_bestseller[["days_to_best_seller"]])
##   [1] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
##  [19] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
##  [37] -14 -14 -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [55] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [73] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
##  [91] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [109] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [127] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [145] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [163] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [181] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [199] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [217] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [235] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14
## [253] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [271] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [289] -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [307] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [325] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [343] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [361] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [379] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [397] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [415] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [433] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [451] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [469] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [487] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14 -14 -14
## [505] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [523] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [541] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [559] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [577] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [595] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [613] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [631] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [649] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [667] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [685] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [703] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [721] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [739] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [757] -15 -15 -15 -15

Top 10 publishers - Using Alexis’ data (Filtered for 2010-2016)

# Filter for 2010-2016 

# Packages: dplyr for data manipulation, readr for CSV import
library(dplyr)
library(readr)

# Import the bestseller file without printing the column-type summary
df <- read_csv("NYT_best_seller_08_16.csv", show_col_types = FALSE)

# Parse published_date from MM/DD/YY text into Date values
df <- df %>%
  mutate(published_date = as.Date(published_date, format = "%m/%d/%y"))

# Sanity check: earliest and latest publication dates before any filtering
df$published_date %>%
  range(na.rm = TRUE) %>%
  print()
## [1] "2008-06-08" "2016-06-12"
# Restrict to books published from 2010-01-01 through 2016-12-31 (inclusive)
df_filtered <- filter(
  df,
  published_date >= as.Date("2010-01-01"),
  published_date <= as.Date("2016-12-31")
)

# Tally rows per publisher, largest count first
publisher_counts_2010_2016 <- count(df_filtered, publisher, name = "n", sort = TRUE)

# Report the size of the result (rows = publishers, columns = 2)
publisher_counts_2010_2016 %>%
  dim() %>%
  print()
## [1] 115   2
# Preview the first rows of the publisher tally
publisher_counts_2010_2016 %>%
  head() %>%
  print()
## # A tibble: 6 × 2
##   publisher         n
##   <chr>         <int>
## 1 Grand Central    63
## 2 Bantam           54
## 3 Berkley          40
## 4 Vintage          28
## 5 Ballantine       27
## 6 Putnam           26
# Show the publisher tally computed from the 2010-2016 rows
publisher_counts_2010_2016 %>%
  print()
## # A tibble: 115 × 2
##    publisher            n
##    <chr>            <int>
##  1 Grand Central       63
##  2 Bantam              54
##  3 Berkley             40
##  4 Vintage             28
##  5 Ballantine          27
##  6 Putnam              26
##  7 Dell                25
##  8 Little, Brown       24
##  9 Simon & Schuster    20
## 10 Knopf               19
## # ℹ 105 more rows

Visualization for Publisher counts in NYT Best Sellers - Alexis’ Data (2010-2016)

# Horizontal bar chart of best-seller counts for every publisher,
# with publishers ordered by their count.
ggplot(publisher_counts_2010_2016, aes(x = reorder(publisher, n), y = n)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = "Publisher Counts in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal() +
  theme(
    # Enlarged title, axis titles and tick labels
    plot.title = element_text(size = 18, face = "bold"),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 14)   # publisher names after the flip
  )

Top 10 Publishers 2010-2016

# Ten publishers with the highest counts (first ten rows after sorting)
top_publishers <- publisher_counts_2010_2016 %>%
  arrange(desc(n)) %>%
  slice_head(n = 10)

# Horizontal bar chart of those ten publishers
ggplot(top_publishers, aes(x = reorder(publisher, n), y = n)) +
  geom_col(fill = "red") +
  coord_flip() +
  labs(
    title = "Top 10 Publishers in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  theme_minimal()

Grand Central; Bantam; Berkley; Vintage; Ballantine; Putnam; Dell; Little, Brown; Simon & Schuster; and Knopf have the most NYT best sellers from 2010-2016.

Month of publication for best sellers (not looking at categories), using Lucas’ data. UNCLEAN

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as an ordered factor, January..December. The name is looked up from
# the month number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name,
  ordered = TRUE
)

# Count number of rows (books) per publication month
monthly_counts <- data %>%
  group_by(month) %>%
  summarise(count = n())

# Bar chart: one bar per month, colored by month, legend hidden
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Frequency of Bestseller Books Released Per Month",
       x = "Month",
       y = "Number of Bestsellers") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stacked bar. UNCLEAN

# NOTE(review): updated_categories is not used again in this chunk; kept in
# case other chunks rely on it.
updated_categories <- read.csv("Updated_Bestsellers_Data_Cleaned.csv")


library(ggplot2)
library(dplyr)
library(lubridate)

# Load the updated dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Convert the published_date column (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Stacked bar chart: rows per month, split by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Heat map. UNCLEANED DATA

library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Count rows for each month / category pair
heatmap_data <- data %>%
  count(month, New_Category)

# Heatmap: one tile per month / category pair, shaded by count
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal() +
  labs(title = "Heatmap of Bestsellers by Month and Category", 
       x = "Month", 
       y = "Category", 
       fill = "Count") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stacked area chart. UNCLEANED DATA

library(ggplot2)
library(dplyr)
library(lubridate)

# Load the dataset
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Count rows for each month / category pair. count() omits pairs with no
# rows, so they are added back with n = 0; otherwise the stacked area would
# bridge over a missing month instead of dropping to zero.
area_chart_data <- data %>%
  count(month, New_Category) %>%
  tidyr::complete(month, New_Category, fill = list(n = 0L))

# Stacked area chart: one band per category across the months
ggplot(area_chart_data, aes(x = month, y = n, fill = New_Category, group = New_Category)) +
  geom_area(position = "stack", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Stacked Area Chart of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

USING COMPLETELY CLEAN DATA TO VISUALIZE MONTHS OF BEST SELLERS

# Per month 
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as an ordered factor, January..December. The name is looked up from
# the month number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name,
  ordered = TRUE
)

# Count number of rows (books) per publication month
monthly_counts <- data %>%
  group_by(month) %>%
  summarise(count = n())

# Bar chart: one bar per month, colored by month, legend hidden
ggplot(monthly_counts, aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  labs(title = "Frequency of Bestseller Books Released Per Month",
       x = "Month",
       y = "Number of Bestsellers") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Books released in February appear the least in the bestseller list, while October has the most.

Stacked Area Chart of categories by month with clean data

data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Count rows for each month / category pair. count() omits pairs with no
# rows, so they are added back with n = 0; otherwise the stacked area would
# bridge over a missing month instead of dropping to zero.
area_chart_data <- data %>%
  count(month, New_Category) %>%
  tidyr::complete(month, New_Category, fill = list(n = 0L))

# Stacked area chart: one band per category across the months
ggplot(area_chart_data, aes(x = month, y = n, fill = New_Category, group = New_Category)) +
  geom_area(position = "stack", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Stacked Area Chart of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Fiction, Manga/Graphic Novels, and Nonfiction categories have the most books on the bestseller list year round compared to the three other categories.

Heat map of Bestsellers by month and category with clean data

data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Convert published_date (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Count rows for each month / category pair
heatmap_data <- data %>%
  count(month, New_Category)

# Heatmap: one tile per month / category pair, shaded by count
ggplot(heatmap_data, aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  theme_minimal() +
  labs(title = "Heatmap of Bestsellers by Month and Category", 
       x = "Month", 
       y = "Category", 
       fill = "Count") + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

There is little seasonality for books categorized as Self-Improvement and Other. Surprisingly, books categorized as Young Audiences peak on the bestseller list in October. Fiction books peak in March, April, and September and are most frequently on the bestseller list year round compared to other categories.

Distribution of bestsellers by month and category with clean data

data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Convert the published_date column (m/d/yy text) to Date format
data$published_date <- as.Date(data$published_date, format = "%m/%d/%y")

# Month as a factor in calendar order. The name is looked up from the month
# number so the result does not depend on the session locale
# (format(x, "%B") returns localized names that may not match month.name).
data$month <- factor(
  month.name[as.integer(format(data$published_date, "%m"))],
  levels = month.name
)

# Stacked bar chart: rows per month, split by New_Category
ggplot(data, aes(x = month, fill = New_Category)) +
  geom_bar() +
  theme_minimal() +
  labs(title = "Distribution of Bestsellers by Month",
       x = "Month",
       y = "Count of Bestsellers",
       fill = "Category") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Shows the distribution of each category per month. Younger Audiences, Self-Improvement, and Other have the fewest bestsellers year round. There is limited seasonality in all categories.

Count for each author on best seller list - Using Lucas’ Data (CLEANED)

nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows per author, most frequent first
author_counts <- nyt_authors %>%
  as_tibble() %>%
  count(author, name = "count", sort = TRUE)

print(author_counts)
## # A tibble: 3,045 × 2
##    author                            count
##    <chr>                             <int>
##  1 Danielle Steel                      113
##  2 David Baldacci                      100
##  3 Nora Roberts                         80
##  4 John Grisham                         74
##  5 James Patterson                      68
##  6 Debbie Macomber                      67
##  7 James Patterson and Maxine Paetro    63
##  8 John Sandford                        58
##  9 Stuart Woods                         57
## 10 Janet Evanovich                      55
## # ℹ 3,035 more rows

Authors and their respective categories

# Packages: dplyr for the tally, tibble for display
library(dplyr)
library(tibble)

# Read the dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Tally rows per (author, category) pair, most frequent first
author_counts <- nyt_authors %>%
  as_tibble() %>%
  count(author, New_Category, name = "count", sort = TRUE)

# Keep a tibble copy and print it
author_tibble <- as_tibble(author_counts)
print(author_tibble)
## # A tibble: 3,351 × 3
##    author                            New_Category count
##    <chr>                             <chr>        <int>
##  1 Danielle Steel                    Fiction         90
##  2 David Baldacci                    Fiction         81
##  3 Nora Roberts                      Fiction         71
##  4 John Grisham                      Fiction         59
##  5 Stuart Woods                      Fiction         57
##  6 James Patterson                   Fiction         52
##  7 James Patterson and Maxine Paetro Fiction         51
##  8 John Sandford                     Fiction         47
##  9 Stephen King                      Fiction         47
## 10 Lee Child                         Fiction         43
## # ℹ 3,341 more rows

Visualization of top 20 authors overall

# author_counts may hold one row per (author, category) pair at this point,
# so counts are first summed per author; the ranking then reflects each
# author's overall appearances. slice_max() keeps ties, as top_n() did.
top_authors <- author_counts %>%
  count(author, wt = count, name = "count") %>%
  slice_max(order_by = count, n = 20)

# Horizontal bar chart of the top authors, ordered and shaded by count
ggplot(top_authors, aes(x = reorder(author, count), y = count, fill = count)) +
  geom_col() + 
  coord_flip() + 
  scale_fill_gradient(low = "blue", high = "red") + 
  labs(title = "Top 20 Authors on NYT Bestseller List", 
       x = "Author", 
       y = "Number of Appearances") + 
  theme_minimal(base_size = 14)

Top 10 Authors in Each Category (not excluding if multiple authors have same count)

# Load necessary libraries
library(ggplot2)
library(dplyr)

# Load dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Count occurrences of each author within each category
author_counts <- nyt_authors %>% 
  group_by(New_Category, author) %>% 
  summarise(count = n(), .groups = "drop") %>% 
  arrange(New_Category, desc(count))

# Select the top 10 authors per category (ties are kept, so a category can
# contribute more than 10 rows), then drop the grouping.
top_authors <- author_counts %>%
  group_by(New_Category) %>%
  slice_max(order_by = count, n = 10) %>%
  ungroup() %>%
  # Per-facet ordering key: an author who appears in several categories gets
  # one key per category, so bars are ordered by the count within each facet
  # rather than by a single position shared across facets.
  mutate(author_key = reorder(paste(author, New_Category, sep = "___"), count))

# Create bar chart with facet wrap
ggplot(top_authors, aes(x = author_key, y = count, fill = count)) +
  geom_col() +
  coord_flip() +  # Flip for readability
  scale_x_discrete(labels = function(key) sub("___.*$", "", key)) +  # Show only the author name
  scale_fill_gradient(low = "blue", high = "red") +  # Color gradient
  facet_wrap(~ New_Category, scales = "free_y") +  # Separate plots per category
  labs(title = "Top 10 Authors in Each Category",
       x = "Author",
       y = "Number of Appearances",
       fill = "Count") +
  theme_minimal(base_size = 13) +
  theme(axis.text.y = element_text(size = 12)) +
  theme(axis.text.x = element_text(size = 12))

Using with_ties = FALSE to ensure exactly 10 authors per category

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(forcats) 

# Load dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# Count occurrences of each author within each category
author_counts <- nyt_authors %>% 
  group_by(New_Category, author) %>% 
  summarise(count = n(), .groups = "drop") %>% 
  arrange(New_Category, desc(count))

# Select exactly 10 authors per category (ties broken by row order)
top_authors <- author_counts %>%
  group_by(New_Category) %>%
  slice_max(order_by = count, n = 10, with_ties = FALSE) %>%
  ungroup() %>%  # Remove grouping for plotting
  # Per-facet ordering key: reordering the bare author name would give an
  # author who appears in several categories one position shared by all
  # facets. A key per (author, category) orders each facet by its own counts.
  mutate(
    author_key = forcats::fct_reorder(
      paste(author, New_Category, sep = "___"), count, .desc = TRUE
    )
  )

# Create bar chart with facet wrap, ordered within each facet
ggplot(top_authors, aes(x = author_key, y = count, fill = count)) +
  geom_col() +
  coord_flip() +  # Flip for readability
  scale_x_discrete(labels = function(key) sub("___.*$", "", key)) +  # Show only the author name
  scale_fill_gradient(low = "blue", high = "red") +  # Color gradient
  facet_wrap(~ New_Category, scales = "free_y") +  # Separate plots per category
  labs(title = "Top 10 Authors in Each Category",
       x = "Author",
       y = "Number of Appearances",
       fill = "Count") +
  theme_minimal(base_size = 13) +
  theme(axis.text.y = element_text(size = 12)) +
  theme(axis.text.x = element_text(size = 12))

James Patterson and Danielle Steel appear in multiple categories as top authors.

# Packages used in this chunk
library(dplyr)
library(ggplot2)
library(forcats)

# Read the dataset
nyt_authors <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)

# One row per (category, author) pair with its number of rows
author_counts <- nyt_authors %>%
  group_by(New_Category, author) %>%
  summarise(count = n(), .groups = "drop") %>%
  arrange(New_Category, desc(count))

# Per author: number of categories and total rows. Keep authors found in at
# least two categories, then take the 10 largest totals (ties are kept, so
# more than 10 rows can come back).
multi_category_authors <- author_counts %>%
  group_by(author) %>%
  summarise(
    category_count = n(),
    total_count = sum(count)
  ) %>%
  filter(category_count >= 2) %>%
  arrange(desc(total_count)) %>%
  slice_max(order_by = total_count, n = 10)


print(multi_category_authors)
## # A tibble: 11 × 3
##    author                            category_count total_count
##    <chr>                                      <int>       <int>
##  1 Danielle Steel                                 3         113
##  2 David Baldacci                                 3         100
##  3 Nora Roberts                                   2          80
##  4 John Grisham                                   4          74
##  5 James Patterson                                5          68
##  6 Debbie Macomber                                2          67
##  7 James Patterson and Maxine Paetro              3          63
##  8 John Sandford                                  2          58
##  9 Janet Evanovich                                2          55
## 10 Christine Feehan                               2          54
## 11 Lee Child                                      2          54

Top 11 authors (since Christine Feehan and Lee Child both have 54 books in bestseller list) who have bestsellers in more than 1 category.